# General packages
library(tidyverse)
library(janitor)
library(plotly)
library(RColorBrewer)
# Packages for cluster analysis:
library(NbClust)
library(cluster)
library(factoextra)
library(dendextend)
library(ggdendro)
# Packages for text mining/sentiment analysis/word cloud
library(pdftools)
library(tidytext)
library(wordcloud)
# Standardize the iris column names to snake_case
# (e.g. Sepal.Length becomes sepal_length).
iris_nice <- clean_names(iris)
# Scatterplot of petal width against petal length, colored by species.
ggplot(data = iris_nice, mapping = aes(x = petal_length, y = petal_width)) +
  geom_point(aes(color = species))
# Without "color = species", this plot might look like only 2 clusters; keep that in mind.
# Scatterplot of sepal width against sepal length, colored by species.
ggplot(data = iris_nice, mapping = aes(x = sepal_length, y = sepal_width)) +
  geom_point(aes(color = species))
# How many clusters does R think there should be?
# Ask NbClust which number of clusters (between 2 and 10) its indices favor
# for columns 1-4 of the data, using k-means as the clustering method.
number_est <- NbClust(
  data = iris_nice[, 1:4],
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Arguments: columns 1-4 of the data, the minimum and maximum number of clusters to consider, and the method ("kmeans").
# Although 10 indices propose 2 clusters, we are going to use 3 because it makes the most sense given what we know about the data.
# So now we perform k-means clustering.
# Fit k-means with 3 clusters on columns 1-4 of the data.
# k-means starts from randomly chosen centers, so fix the RNG seed to make
# the result reproducible and try several random starts (nstart) so the fit
# is not stuck in a poor local optimum. Cluster numbers are arbitrary labels,
# so the numbering may differ from any previously recorded output.
set.seed(42)
iris_km <- kmeans(iris_nice[1:4], centers = 3, nstart = 25)
# Print the number of observations assigned to each cluster.
iris_km[["size"]]
## [1] 62 38 50
# 62, 38, and 50 are the numbers of observations in each cluster.
# Print the center of each cluster (one row per cluster, one column per variable).
iris_km[["centers"]]
## sepal_length sepal_width petal_length petal_width
## 1 5.901613 2.748387 4.393548 1.433871
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.006000 3.428000 1.462000 0.246000
# Shows the multivariate center location of each of the three clusters.
# Print the cluster number assigned to each observation, in row order.
iris_km[["cluster"]]
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# There is some overlap between clusters 1 and 2.
# Now we attach each observation's cluster assignment (the vector above) to the data.
# Add each observation's k-means cluster number, stored as a factor,
# as a new column `cluster_no`.
iris_cl <- iris_nice %>%
  mutate(cluster_no = factor(iris_km$cluster))
# now look at it
# Scatterplot of sepal width against sepal length, colored by assigned cluster.
ggplot(data = iris_cl, mapping = aes(x = sepal_length, y = sepal_width)) +
  geom_point(aes(color = cluster_no))
# Scatterplot of petal width against petal length: color shows the assigned
# cluster and point shape shows the species, so the two can be compared.
ggplot(iris_cl) +
  geom_point(
    aes(
      x = petal_length,
      y = petal_width,
      color = cluster_no,
      # `shape` is the ggplot2 aesthetic name; `pch` is the base-graphics alias.
      shape = species
    )
  ) +
  scale_color_brewer(palette = "Set2")
# 3d plot time!
# Interactive 3D scatterplot: petal length (x), petal width (y), and sepal
# width (z); color shows the assigned cluster and symbol shows the species.
# `mode = "markers"` is set explicitly so plotly does not have to guess the
# trace mode and emit a "No scatter3d mode specifed" message. All columns are
# referenced with formulas against `data = iris_cl` instead of mixing raw
# vectors and formulas; axis titles should then default to the column names.
plot_ly(
  data = iris_cl,
  x = ~petal_length,
  y = ~petal_width,
  z = ~sepal_width,
  type = "scatter3d",
  mode = "markers",
  color = ~cluster_no,
  symbol = ~species,
  marker = list(size = 3),
  colors = "Set1"
)
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode